library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
As COVID-19 spreads at an alarming rate, a pressing global question emerges: which characteristics of a country contribute to the spread of the coronavirus? We analyze how a country's population, population density, and continent relate to the spread of COVID-19.
# Read the raw cases/deaths-per-million CSV and print it.
COVID <- read.csv("total-covid-cases-deaths-per-million.csv")
COVID
# Row count of the raw table.
nrow(COVID)
[1] 9487
# Column names as assigned by read.csv().
names(COVID)
[1] "total.covid.cases.deaths.per.million" "X" "X.1"
[4] "X.2" "X.3" "X.4"
[7] "X.5" "X.6" "X.7"
[10] "X.8" "X.9" "X.10"
[13] "X.11" "X.12" "X.13"
[16] "X.14" "X.15" "X.16"
[19] "X.17" "X.18" "X.19"
[22] "X.20" "X.21" "X.22"
[25] "X.23" "X.24" "X.25"
[28] "X.26" "X.27" "X.28"
[31] "X.29" "X.30" "X.31"
[34] "X.32" "X.33" "X.34"
[37] "X.35" "X.36" "X.37"
[40] "X.38" "X.39" "X.40"
[43] "X.41" "X.42" "X.43"
[46] "X.44" "X.45" "X.46"
[49] "X.47" "X.48" "X.49"
[52] "X.50" "X.51" "X.52"
[55] "X.53" "X.54" "X.55"
[58] "X.56" "X.57" "X.58"
[61] "X.59" "X.60" "X.61"
[64] "X.62" "X.63" "X.64"
[67] "X.65" "X.66" "X.67"
[70] "X.68" "X.69" "X.70"
[73] "X.71" "X.72" "X.73"
[76] "X.74" "X.75" "X.76"
[79] "X.77" "X.78" "X.79"
[82] "X.80" "X.81" "X.82"
[85] "X.83" "X.84" "X.85"
[88] "X.86" "X.87" "X.88"
[91] "X.89" "X.90" "X.91"
[94] "X.92" "X.93" "X.94"
[97] "X.95" "X.96" "X.97"
[100] "X.98" "X.99" "X.100"
[103] "X.101" "X.102" "X.103"
[106] "X.104" "X.105" "X.106"
[109] "X.107" "X.108" "X.109"
[112] "X.110" "X.111" "X.112"
[115] "X.113" "X.114" "X.115"
[118] "X.116" "X.117" "X.118"
[121] "X.119" "X.120" "X.121"
[124] "X.122" "X.123" "X.124"
[127] "X.125" "X.126" "X.127"
[130] "X.128" "X.129" "X.130"
[133] "X.131" "X.132" "X.133"
[136] "X.134" "X.135" "X.136"
[139] "X.137" "X.138" "X.139"
[142] "X.140" "X.141" "X.142"
[145] "X.143" "X.144" "X.145"
[148] "X.146" "X.147" "X.148"
[151] "X.149" "X.150" "X.151"
[154] "X.152" "X.153" "X.154"
[157] "X.155" "X.156" "X.157"
[160] "X.158" "X.159" "X.160"
[163] "X.161" "X.162" "X.163"
[166] "X.164" "X.165" "X.166"
[169] "X.167" "X.168" "X.169"
[172] "X.170" "X.171" "X.172"
[175] "X.173" "X.174" "X.175"
[178] "X.176" "X.177" "X.178"
[181] "X.179" "X.180" "X.181"
[184] "X.182" "X.183" "X.184"
[187] "X.185" "X.186" "X.187"
[190] "X.188" "X.189" "X.190"
[193] "X.191" "X.192" "X.193"
[196] "X.194" "X.195" "X.196"
[199] "X.197" "X.198" "X.199"
[202] "X.200" "X.201" "X.202"
[205] "X.203" "X.204" "X.205"
[208] "X.206" "X.207" "X.208"
[211] "X.209" "X.210" "X.211"
[214] "X.212" "X.213" "X.214"
[217] "X.215" "X.216" "X.217"
[220] "X.218" "X.219" "X.220"
[223] "X.221" "X.222" "X.223"
[226] "X.224" "X.225" "X.226"
[229] "X.227" "X.228" "X.229"
[232] "X.230" "X.231" "X.232"
[235] "X.233" "X.234" "X.235"
[238] "X.236" "X.237" "X.238"
[241] "X.239" "X.240" "X.241"
[244] "X.242" "X.243" "X.244"
[247] "X.245" "X.246" "X.247"
[250] "X.248" "X.249" "X.250"
[253] "X.251" "X.252" "X.253"
[256] "X.254"
# First rows of the raw COVID table.
head(COVID)
# Country-level reference table, followed by its row count.
CountryData
nrow(CountryData)
[1] 256
# Attributes available in CountryData.
names(CountryData)
[1] "country" "area" "pop" "growth" "birth" "death"
[7] "migr" "maternal" "infant" "life" "fert" "health"
[13] "HIVrate" "HIVpeople" "HIVdeath" "obesity" "underweight" "educ"
[19] "unemploymentYouth" "GDP" "GDPgrowth" "GDPcapita" "saving" "indProd"
[25] "labor" "unemployment" "family" "tax" "budget" "debt"
[31] "inflation" "discount" "lending" "narrow" "broad" "credit"
[37] "shares" "balance" "exports" "imports" "gold" "externalDebt"
[43] "homeStock" "abroadStock" "elecProd" "elecCons" "elecExp" "elecImp"
[49] "elecCap" "elecFossil" "elecNuc" "elecHydro" "elecRenew" "oilProd"
[55] "oilExp" "oilImp" "oilRes" "petroProd" "petroCons" "petroExp"
[61] "petroImp" "gasProd" "gasCons" "gasExp" "gasImp" "gasRes"
[67] "mainlines" "cell" "netHosts" "netUsers" "airports" "railways"
[73] "roadways" "waterways" "marine" "military"
# First rows of CountryData.
head(CountryData)
# Region lookup table, followed by its row count.
countryRegions
nrow(countryRegions)
[1] 254
# Classification columns available in countryRegions.
names(countryRegions)
[1] "ISO3" "ADMIN" "REGION" "continent" "GEO3major" "GEO3" "IMAGE24" "GLOCAF"
[9] "Stern" "SRESmajor" "SRES" "GBD" "AVOIDnumeric" "AVOIDname" "LDC" "SID"
[17] "LLDC"
# First rows of countryRegions, then the raw COVID table again.
head(countryRegions)
COVID
Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time.
# Build a tidy table with the columns country, Code, date and casesPerMillion.
#
# read.csv() named the columns total.covid.cases.deaths.per.million, X, X.1,
# ...; the first data row is dropped (presumably it holds the file's real
# column headers -- confirm against the CSV).
#
# Because of that text row the value columns are not read as numbers. When
# they are factors, as.integer() returns the factor's internal level codes
# rather than the recorded values, so the values are converted through
# as.character() first. This also keeps fractional per-million figures.
TidyCOVID <- COVID %>%
  filter(row_number() > 1) %>%
  select(
    country = total.covid.cases.deaths.per.million,
    Code = X,
    date = X.1,
    casesPerMillion = X.3
  ) %>%
  mutate(
    country = as.character(country),
    date = mdy(as.character(date)),
    casesPerMillion = as.numeric(as.character(casesPerMillion))
  )
TidyCOVID
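Each instance (row) of TidyCOVID represents one country on one date, together with the number of COVID-19 cases per million people recorded for that country on that date. (TODO, Evelyn: confirm and expand this explanation of what an instance represents.)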
We will extract the ISO3 country code and continent from the countryRegions data. Since naming conventions for countries vary between sources, the ISO3 country code gives us a standardized country identifier with which to join other data tables.
# Lookup table: each ISO3 code with its REGION value, exposed under the
# column name `continent`.
Labels <- select(countryRegions, ISO3, continent = REGION)
Labels
We will select the aspects of CountryData relevant to our analysis. These attributes are: area (sq km) and pop (number of people).
# Keep country, area and pop (the first three columns of CountryData) and
# derive population density as people per unit of area.
RelevantCountryData <- transmute(
  CountryData,
  country,
  area,
  pop,
  popdensity = pop / area
)
RelevantCountryData
Calculate the number of cases in each country by multiplying casesPerMillion by the country’s population (in millions).
# Attach population data and continent labels to every country/date row and
# estimate the absolute number of cases.
#
# cases = casesPerMillion * (population in millions). The population is NOT
# rounded to whole millions first: rounding would turn every country with
# fewer than 500,000 inhabitants into a population of 0 (and therefore 0
# cases) and would distort the estimate for all other countries.
#
# Both join keys are converted to character so the ISO3 join does not have to
# coerce a factor against a character vector.
COVIDGrowth <- TidyCOVID %>%
  inner_join(RelevantCountryData, by = "country") %>%
  mutate(
    cases = casesPerMillion * pop / 1e6,
    Code = as.character(Code)
  ) %>%
  left_join(
    mutate(Labels, ISO3 = as.character(ISO3)),
    by = c("Code" = "ISO3")
  )
Column `Code`/`ISO3` joining factor and character vector, coercing into character vector
COVIDGrowth
This table records the first date on which each country recorded a nonzero number of COVID-19 cases. It will help us visualize when countries first became infected.
# Earliest date with a nonzero case count, one row per country/continent.
FirstInstance <- COVIDGrowth %>%
  filter(cases != 0) %>%
  group_by(country, continent) %>%
  summarise(beginningofspread = min(date)) %>%
  # summarise() only peels off the last grouping level; drop the remaining
  # grouping so later steps operate on a plain table.
  ungroup()
FirstInstance
This table gives each country's average increase in cases per day, measured from the first day the country recorded COVID-19 through the most recent date in the data table (April 5, 2020).
# Average daily growth per country as of the 2020-04-05 snapshot.
#
# dayselapsed is the number of days between a country's first nonzero case
# count and the snapshot. It is floored at 1 so that a country whose first
# cases appear on the snapshot date itself does not divide by zero and yield
# Inf. Countries without a recorded first date keep NA days, and their rates
# are set to 0.
DailySpread <- COVIDGrowth %>%
  left_join(FirstInstance, by = "country") %>%
  filter(date == as.Date("2020-04-05")) %>%
  mutate(
    dayselapsed = pmax(
      as.numeric(difftime(date, beginningofspread, units = "days")),
      1
    ),
    dailyspread = coalesce(cases / dayselapsed, 0),
    dailyspreadpermillion = coalesce(casesPerMillion / dayselapsed, 0)
  ) %>%
  select(country, beginningofspread, dailyspread, dailyspreadpermillion)
DailySpread
# Add each country's DailySpread columns to every country/date row.
COVIDFinal <- COVIDGrowth %>%
  left_join(DailySpread, by = "country")
COVIDFinal
# Total cases across all countries for each date.
# na.rm = TRUE keeps a single row with a missing case count from turning the
# whole date's total into NA (which ggplot would then drop from the plot).
COVIDFinal %>%
  group_by(date) %>%
  summarise(totalcases = sum(cases, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = totalcases)) +
  geom_point() +
  xlab("Date") +
  ylab("COVID-19 Cases")
# Total cases per date, one panel per continent (rows with any NA removed).
COVIDFinal %>%
  na.omit() %>%
  group_by(date, continent) %>%
  summarise(totalcases = sum(cases)) %>%
  ggplot(aes(date, totalcases)) +
  geom_point() +
  facet_wrap(~continent) +
  labs(x = "Date", y = "COVID-19 Cases")
# Dot plot of the date of each country's first case, stacked and coloured by
# continent; the y axis carries no information and is hidden.
FirstInstance %>%
  na.omit() %>%
  ggplot(aes(beginningofspread, fill = continent)) +
  geom_dotplot(binwidth = 1, binpositions = "all", stackgroups = TRUE) +
  labs(x = "Country's First Case of COVID-19") +
  theme(
    axis.title.y = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.background = element_blank()
  )
# Bar chart of the 20 countries with the highest average daily case growth.
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspread = mean(dailyspread)) %>%
  arrange(desc(dailyspread)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -dailyspread), y = dailyspread)) +
  geom_col(width = 0.9) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Average Number Infected Per Day") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )
# Bar chart of the 20 most populous countries in the data.
COVIDFinal %>%
  group_by(country) %>%
  summarise(pop = mean(pop)) %>%
  arrange(desc(pop)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -pop), y = pop)) +
  geom_col(width = 0.9) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Population") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )
# Population against average daily case growth, coloured by continent.
COVIDFinal %>%
  na.omit() %>%
  ggplot(aes(pop, dailyspread, color = continent)) +
  geom_point() +
  labs(
    x = "Population of Country",
    y = "Average Number Infected Per Day"
  )
# Zoomed view of population against average daily case growth with a linear
# fit per continent.
#
# COVIDFinal has one row per country per date, but pop and dailyspread are
# constant within a country. The table is reduced to one row per country
# first; otherwise every country enters the regression once per date, which
# inflates the sample size and makes the confidence bands too narrow.
#
# Note: xlim()/ylim() discard out-of-range points before the fit, so each
# line is fitted only to the countries visible in this window.
na.omit(COVIDFinal) %>%
  distinct(country, continent, pop, dailyspread) %>%
  ggplot(aes(x = pop, y = dailyspread, color = continent)) +
  geom_point() +
  xlim(0, 500000000) +
  ylim(0, 40000) +
  xlab("Population of Country") +
  ylab("Average Number Infected Per Day") +
  stat_smooth(method = "lm")
# Bar chart of the 20 countries with the highest daily growth per million.
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspreadpermillion = mean(dailyspreadpermillion)) %>%
  arrange(desc(dailyspreadpermillion)) %>%
  head(20) %>%
  ggplot(
    aes(
      x = reorder(country, -dailyspreadpermillion),
      y = dailyspreadpermillion
    )
  ) +
  geom_col(width = 0.9) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Population Per Million Infected Per Day") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )
# Bar chart of the 20 most densely populated countries in the data.
COVIDFinal %>%
  group_by(country) %>%
  summarise(popdensity = mean(popdensity)) %>%
  arrange(desc(popdensity)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -popdensity), y = popdensity)) +
  geom_col(width = 0.9) +
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  ylab("Population Density (people/sq km)") +
  theme(
    axis.text.x = element_text(angle = 60, hjust = 1),
    axis.title.x = element_blank()
  )
# Population density against daily growth per million, all countries.
COVIDFinal %>%
  na.omit() %>%
  ggplot(aes(popdensity, dailyspreadpermillion)) +
  geom_point()
# Same relationship split by continent, restricted to densities of 0-1500.
COVIDFinal %>%
  na.omit() %>%
  ggplot(aes(popdensity, dailyspreadpermillion)) +
  geom_point() +
  facet_wrap(~continent) +
  xlim(0, 1500)
# Wide layout: one row per country, one column per date holding the case
# count. Missing cells are then set to 0.
WideCountries <- COVIDFinal %>%
  select(country, date, cases) %>%
  spread(key = date, value = cases)
WideCountries <- replace(WideCountries, is.na(WideCountries), 0)
WideCountries
# Compare the case trajectories of two countries on one chart.
#
# Args:
#   countryA, countryB: country names as they appear in WideCountries$country.
#
# Returns:
#   A ggplot object showing LOESS-smoothed case counts over time, with one
#   coloured line per country. Reads the WideCountries table from the
#   enclosing environment.
compareCOVID <- function(countryA, countryB) {
  # Reshape one country's row of WideCountries into long (date, count) form.
  # The country column is excluded from the gather, so the counts stay
  # numeric and no positional row-dropping is needed.
  long_counts <- function(name) {
    WideCountries %>%
      filter(country == name) %>%
      gather(key = date, value = count, -country) %>%
      mutate(
        date = lubridate::ymd(date),
        count = as.numeric(count)
      )
  }

  # An unknown name would otherwise silently produce an empty line.
  unknown <- setdiff(c(countryA, countryB), WideCountries$country)
  if (length(unknown) > 0) {
    warning(
      "No data in WideCountries for: ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  both <- rbind(long_counts(countryA), long_counts(countryB))

  ggplot(both, aes(x = date, y = count, color = country)) +
    stat_smooth(formula = y ~ x, method = "loess") +
    ylab("Number of COVID-19 Cases") +
    xlab("Date")
}
# Example comparisons: each call draws the smoothed case curves of the two
# named countries on a shared chart.
compareCOVID("China", "United States")
compareCOVID("Japan", "Russia")
compareCOVID("Puerto Rico", "Belgium")